nf-core spring survey 2025

Introduction

This notebook processes the results of the nf-core community general survey from February/March 2025 (date freeze: 2025-03-20).

Preparation

Load relevant libraries

Show the code
library(tidyverse)
library(janitor)
library(patchwork)
library(ggalluvial)
## Try word clouds? https://cran.r-project.org/web/packages/ggwordcloud/vignettes/ggwordcloud.html

Load relevant data files, and make the column names easier to work with programmatically.

Show the code
## Raw survey responses; clean_names() normalises the column headers
data_responses_raw <-
  read_csv("data/nf-core_community_survey_-_Feb_2025_-_nf-core_community_survey_2025.csv") |>
  clean_names()

## Free text response columns plus manually tagged 'keyword' columns,
## first annotation pass (by @jfy133, file suffix JFY)
data_responses_tagged <-
  read_csv("data/nf-core_community_survey_-_Feb_2025_-_2025-03-16-freeze-annotated_(JFY).csv") |>
  clean_names()

## Same free text response columns with a second, independent set of manually
## tagged 'keyword' columns (file suffix FB)
data_responses_tagged_2 <-
  read_csv("data/nf-core_community_survey_-_Feb_2025_-_2025-03-16-freeze-annotated_(FB).csv") |>
  clean_names()

Demographic information

Number of responses

The first bit of information we can find out is the number of responses overall.

Show the code
## Size of the nf-core Slack workspace, as of March 18 2025
nr_slack_members <- 11640
## Total number of survey rows. count() returns a one-row tibble with a single
## column `n` (not a bare integer); the eNPS calculation further down relies
## on that shape, so keep it as a tibble.
nr_responses <- count(data_responses_raw)

We had 209 responses, which we assume roughly equates to the number of people who responded (as the survey is anonymous, we cannot guarantee this, and some people may have responded twice - however our survey software does try to re-use previous responses if made via the same browser).

As of March 18th 2025, we had 11640 people on slack. This means we have an approximate response rate of 1.8%.

Geographic Location Analysis

First we would like to understand where the respondents of the survey are from.

First we parse and clean-up the response data (fixing some country names to match the map data), and then calculate the number of responses per country.

Show the code
library(ggplot2)
library(dplyr)

## Polygon outlines of all countries, used for the map further down
data_map <- map_data("world")

## Responses per country. Missing answers become an explicit 'No response'
## category and 'United Kingdom' is renamed to 'UK' to match the map data.
data_responses_country <- data_responses_raw |>
  select(what_country_are_you_based_in) |>
  mutate(
    what_country_are_you_based_in = case_when(
      is.na(what_country_are_you_based_in) ~ "No response",
      what_country_are_you_based_in == "United Kingdom" ~ "UK",
      .default = what_country_are_you_based_in
    )
  ) |>
  count(what_country_are_you_based_in, name = "nr_response_per_country") |>
  arrange(desc(nr_response_per_country)) |>
  mutate(
    ## Order the factor by count for plotting, then move 'No response' to the
    ## first level
    what_country_are_you_based_in =
      what_country_are_you_based_in |>
      fct_reorder(nr_response_per_country) |>
      fct_relevel("No response")
  )

## Number of respondents who left the country question empty
nr_country_noresponse <- data_responses_country |>
  filter(what_country_are_you_based_in == "No response") |>
  pull(nr_response_per_country)

We can then visualise these counts in a bar plot to see which countries we got more or fewer responses from.

Show the code
## Horizontal bar chart of the number of responses per country.
## Unanswered rows were recoded to the literal label 'No response' when
## data_responses_country was built, so the subtitle refers to that label
## rather than to 'NA'.
ggplot(
  data_responses_country,
  aes(y = what_country_are_you_based_in, x = nr_response_per_country)
) +
  geom_col(fill = "#24b064") +
  theme_minimal() +
  labs(
    title = "Location of respondees",
    subtitle = "Number of respondees per INSDC country category. No response: question left unanswered.",
    x = "Count",
    y = "Country",
    caption = paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")
  )

To better visualise, we can plot a map with each country that has at least a single response filled with a colour.

Show the code
## Merge the per-country response counts onto the world map polygons.
## The join key is spelled out so the join does not depend on (or print a
## message about) whichever column names the two tables happen to share.
## Countries without a match keep an NA count and are drawn in white.
data_map_rendering <- data_map |>
  left_join(
    data_responses_country |> rename(region = what_country_are_you_based_in),
    by = "region"
  ) |>
  mutate(
    country_fill = if_else(!is.na(nr_response_per_country), "#24b064", "white")
  )

## country_fill already holds literal colour values, so scale_fill_identity()
## uses them as-is instead of mapping them to a palette
ggplot(
  data_map_rendering,
  aes(long, lat, group = group, fill = country_fill)
) +
  geom_polygon(colour = "gray") +
  scale_fill_identity() +
  theme_minimal() +
  labs(
    title = "Countries with a response",
    subtitle = paste("Countries from which at least one response was received.", nr_country_noresponse, "people did not respond to question"),
    x = "Longitude",
    y = "Latitude",
    caption = paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")
  )

nf-core community member type

We also wanted to understand what type of nf-core community member was responding - a user, developer, or both.

For this we take the raw responses, then clean up the community member type, as we had an ‘other’ column which appears to have been not that useful as most people put in responses that still correspond to our three types. We can summarise this by counting the number of categories in the column.

Show the code
## Responses per community member type. Free-text 'other' answers are folded
## into the three main categories; missing answers become 'No response'.
data_responses_respondertype <- data_responses_raw |>
  select(do_you_consider_yourself_to_be_an_nf_core) |>
  mutate(
    do_you_consider_yourself_to_be_an_nf_core = case_match(
      do_you_consider_yourself_to_be_an_nf_core,
      c("Developer, User", "Developer, User, Manager", "both") ~ "User, Developer",
      c("User, Exploring", "Newcomer", "Newbie") ~ "User",
      c("non nf-core nf dev", "Developer, Maintainer", "develop in nextflow") ~ "Developer",
      NA ~ "No response",
      .default = do_you_consider_yourself_to_be_an_nf_core
    )
  ) |>
  count(do_you_consider_yourself_to_be_an_nf_core, name = "nr") |>
  arrange(desc(nr)) |>
  mutate(
    ## Order the factor by count for plotting, then move 'No response' to the
    ## first level
    do_you_consider_yourself_to_be_an_nf_core =
      do_you_consider_yourself_to_be_an_nf_core |>
      fct_reorder(nr) |>
      fct_relevel("No response")
  )

As with the countries, the best way to plot this is via a bar chart.

Show the code
## Horizontal bar chart of the number of responses per responder type.
## x carries the count (nr) and y the responder type, so the axis labels
## name those quantities.
ggplot(
  data_responses_respondertype,
  aes(y = do_you_consider_yourself_to_be_an_nf_core, x = nr)
) +
  geom_col(fill = "#24b064") +
  theme_minimal() +
  labs(
    title = "Type of respondee",
    subtitle = "Users run pipelines, Developers code pipelines, Snakemake developer is a survey troll",
    x = "Count",
    y = "Type of respondee",
    caption = paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")
  )

Overall happiness with nf-core

Another question was ‘how likely would you be to recommend nf-core to others’, which we use as a general gauge of how happy responders are with nf-core as a whole.

We can select the column.

Show the code
## Recommendation score per responder type. Responder types are harmonised as
## before; rows without a score, or with a type outside the three main
## categories, are dropped.
data_recommend_score <- data_responses_raw |>
  select(
    do_you_consider_yourself_to_be_an_nf_core,
    how_likely_are_you_to_recommend_nf_core_to_others
  ) |>
  mutate(
    do_you_consider_yourself_to_be_an_nf_core = case_match(
      do_you_consider_yourself_to_be_an_nf_core,
      c("Developer, User", "Developer, User, Manager", "both") ~ "User, Developer",
      c("User, Exploring", "Newcomer", "Newbie") ~ "User",
      c("non nf-core nf dev", "Developer, Maintainer", "develop in nextflow") ~ "Developer",
      NA ~ "No response",
      .default = do_you_consider_yourself_to_be_an_nf_core
    )
  ) |>
  drop_na(how_likely_are_you_to_recommend_nf_core_to_others) |>
  filter(
    do_you_consider_yourself_to_be_an_nf_core %in% c("User", "Developer", "User, Developer")
  )

And then use the inbuilt grouping and counting functionality to plot a histogram with a count of responses per score level.

Show the code
## Distribution of the 'likelihood to recommend' score, one panel per
## responder type.
ggplot(
  data_recommend_score,
  aes(x = how_likely_are_you_to_recommend_nf_core_to_others)
) +
  ## One bar per whole score: the axis breaks below cover the eleven values
  ## 0-10, which a fixed count of 10 bins cannot line up with.
  geom_histogram(binwidth = 1, fill = "#24b064") +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  labs(
    title = "Likelihood of recommending nf-core to others",
    ## Upper end of the scale is 10, matching the axis breaks and the
    ## >= 9 'Promoter' threshold used for the eNPS
    subtitle = "0: not at all, 10: extremely likely",
    x = "Score",
    y = "Count",
    caption = paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")
  ) +
  facet_wrap(vars(do_you_consider_yourself_to_be_an_nf_core), ncol = 1)

We can also calculate the ‘net promoter score’, which subtracts the percentage of ‘detractor’ responders from the percentage of ‘promoter’ responders.

The higher the value, the happier the responders are overall. Typically a negative value indicates there are more ‘detractors’ than ‘promoters’ (therefore a negative view), whereas 60-100 indicates strong responder satisfaction.

Show the code
## Share of Detractors (score <= 6), Neutrals (7-8) and Promoters (>= 9)
## among the respondents who answered the recommendation question.
## Unanswered rows are removed first; otherwise they would fall through to
## the 'Neutral' default and inflate the denominator.
## The percentages are computed from the group counts themselves, so `percent`
## is a plain numeric column and does not depend on the shape of nr_responses.
enps_percentages <- data_responses_raw |>
  select(how_likely_are_you_to_recommend_nf_core_to_others) |>
  filter(!is.na(how_likely_are_you_to_recommend_nf_core_to_others)) |>
  mutate(
    group = case_when(
      how_likely_are_you_to_recommend_nf_core_to_others <= 6 ~ "Detractor",
      how_likely_are_you_to_recommend_nf_core_to_others >= 9 ~ "Promoter",
      .default = "Neutral"
    )
  ) |>
  count(group, name = "nr") |>
  mutate(percent = nr / sum(nr) * 100)

## eNPS = % Promoters - % Detractors. sum() over the matching rows yields 0
## when a group has no members, so the score is always a single number.
enps_score <- round(
  sum(enps_percentages$percent[enps_percentages$group == "Promoter"]) -
    sum(enps_percentages$percent[enps_percentages$group == "Detractor"])
)

Based on the eNPS we have a score of 54 which is pretty close to a very strong ‘happiness’ but we can do better.

Confidence with nf-core pipelines

Finally to understand if responders were newcomers versus long time members, we asked how confident people felt using nf-core pipelines, with the assumption of more confident people will have been around for a longer time.

Show the code
## Confidence score per responder type. Responder types are harmonised as
## before; rows without a score, or with a type outside the three main
## categories, are dropped.
data_confidence_score <- data_responses_raw |>
  select(
    do_you_consider_yourself_to_be_an_nf_core,
    how_confident_are_you_working_with_nf_core_pipelines
  ) |>
  mutate(
    do_you_consider_yourself_to_be_an_nf_core = case_match(
      do_you_consider_yourself_to_be_an_nf_core,
      c("Developer, User", "Developer, User, Manager", "both") ~ "User, Developer",
      c("User, Exploring", "Newcomer", "Newbie") ~ "User",
      c("non nf-core nf dev", "Developer, Maintainer", "develop in nextflow") ~ "Developer",
      NA ~ "No response",
      .default = do_you_consider_yourself_to_be_an_nf_core
    )
  ) |>
  drop_na(how_confident_are_you_working_with_nf_core_pipelines) |>
  filter(
    do_you_consider_yourself_to_be_an_nf_core %in% c("User", "Developer", "User, Developer")
  )

And generate another histogram.

Show the code
## Distribution of the confidence score, one panel per responder type
ggplot(
  data = data_confidence_score,
  mapping = aes(x = how_confident_are_you_working_with_nf_core_pipelines)
) +
  geom_histogram(fill = "#24b064", bins = 5) +
  facet_wrap(vars(do_you_consider_yourself_to_be_an_nf_core), ncol = 1) +
  labs(
    title = "Level of confidence in using or developing nf-core pipelines",
    subtitle = "1: newcomer, 5: advanced",
    x = "Confidence score",
    y = "Count",
    caption = paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")
  ) +
  theme_minimal()

Feedback - positive and to improve upon (global)

The main questions of the survey - what are you most happy with, what have been your biggest difficulties, and additional feedback/requests - were all free text.

To get a rough overview of the types of responses - i.e., which topics responders had the biggest issues with - we read through each response and for each of the three questions we assigned one or more tags corresponding to the different ‘topics’ the response fell under.

Data preparation

First we need to load the tags for each response column, split each tag column into individual tags, then count the number of times each tag occurs. We do a simple count as a responder may have given multiple bits of feedback.

Show the code
## Pull out the tag column for each of the three free-text questions,
## first for annotation set 1 ...
data_responses_tagged_positive <- select(data_responses_tagged, positive_tags)
data_responses_tagged_improve <- select(data_responses_tagged, improvement_tags)
data_responses_tagged_request <- select(data_responses_tagged, request_tags)
## ... then for annotation set 2
data_responses_tagged_2_positive <- select(data_responses_tagged_2, positive_tags)
data_responses_tagged_2_improve <- select(data_responses_tagged_2, improvement_tags)
data_responses_tagged_2_request <- select(data_responses_tagged_2, request_tags)

## Count how often each topic tag occurs in a vector of comma-separated tag
## strings.
##
## in_list: character vector; each element holds one or more tags separated
##          by commas.
## Returns a tibble with one row per tag (`topic`, a factor ordered by
## `count`) sorted by descending `count`. The 'No feedback' tag is removed
## from the result.
summarise_tag_results <- function(in_list) {
  single_tags <- unlist(str_split(in_list, ","))
  tag_table <- tibble(topic = str_trim(single_tags, side = "both"))

  tag_table |>
    count(topic, name = "count") |>
    arrange(desc(count)) |>
    mutate(topic = fct_reorder(topic, count)) |>
    filter(topic != "No feedback")
}

## Topic counts for annotation set 1
data_tags_positive <- data_responses_tagged_positive |>
  pull(positive_tags) |>
  summarise_tag_results()
data_tags_improve <- data_responses_tagged_improve |>
  pull(improvement_tags) |>
  summarise_tag_results()
data_tags_request <- data_responses_tagged_request |>
  pull(request_tags) |>
  summarise_tag_results()

## Topic counts for annotation set 2
data_tags2_positive <- data_responses_tagged_2_positive |>
  pull(positive_tags) |>
  summarise_tag_results()
data_tags2_improve <- data_responses_tagged_2_improve |>
  pull(improvement_tags) |>
  summarise_tag_results()
data_tags2_request <- data_responses_tagged_2_request |>
  pull(request_tags) |>
  summarise_tag_results()

Distribution of feedback topics

We can then plot bar charts with the number of responses hitting a particular topic. Note that we remove responses that had no feedback.

Show the code
## Horizontal bar chart of topic counts.
##
## in_data:  table with a `topic` and a `count` column.
## title:    plot title.
## subtitle: plot subtitle.
## Reads the global `nr_responses` for the caption. The x axis is fixed to
## 0-80 so that plots placed side by side share the same scale.
plot_topics_summary <- function(in_data, title, subtitle) {
  survey_caption <- paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")

  topic_plot <- ggplot(in_data, aes(x = count, y = topic)) +
    geom_col(fill = "#24b064")

  topic_plot +
    theme_minimal() +
    labs(
      title = title,
      subtitle = subtitle,
      x = "Count",
      y = "Topic",
      caption = survey_caption
    ) +
    scale_x_continuous(breaks = seq(0, 70, by = 10), limits = c(0, 80))
}

## One row per feedback question (positive / improve / request), with the two
## reviewers' tag counts side by side in each row
(
  plot_topics_summary(
    in_data = data_tags_positive,
    title = "Topics the respondees are happy about",
    subtitle = "Reviewer 1 interpretation"
  ) +
    plot_topics_summary(
      in_data = data_tags2_positive,
      title = "Topics the respondees are happy about",
      subtitle = "Reviewer 2 interpretation"
    )
) /
  (
    plot_topics_summary(
      in_data = data_tags_improve,
      title = "Topics the respondees think should be improved upon",
      subtitle = "Reviewer 1 interpretation"
    ) +
      plot_topics_summary(
        in_data = data_tags2_improve,
        title = "Topics the respondees think should be improved upon",
        subtitle = "Reviewer 2 interpretation"
      )
  ) /
  (
    plot_topics_summary(
      in_data = data_tags_request,
      title = "Topics of specific requests from respondees",
      subtitle = "Reviewer 1 interpretation"
    ) +
      plot_topics_summary(
        in_data = data_tags2_request,
        title = "Topics of specific requests from respondees",
        subtitle = "Reviewer 2 interpretation"
      )
  )

Feedback - positive and to improve upon (per responder type)

We can also refine the above information by splitting the feedback for both User-only and Developer-only responders (we exclude people who consider themselves both for simplicity)

Data preparation

First we need to load the tags for each response column, split each tag column into individual tags, then count the number of times each tag occurs. We do a simple count as a responder may have given multiple bits of feedback.

Show the code
## Keep the responder type next to each question's tag column, renaming the
## tag column to a common `tags` so one helper can process all of them.
## Annotation set 1:
data_responses_tagged_positive <- data_responses_tagged |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = positive_tags)
data_responses_tagged_improve <- data_responses_tagged |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = improvement_tags)
data_responses_tagged_request <- data_responses_tagged |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = request_tags)
## Annotation set 2:
data_responses_tagged_2_positive <- data_responses_tagged_2 |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = positive_tags)
data_responses_tagged_2_improve <- data_responses_tagged_2 |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = improvement_tags)
data_responses_tagged_2_request <- data_responses_tagged_2 |>
  select(do_you_consider_yourself_to_be_an_nf_core, tags = request_tags)

## Count how often each topic tag occurs, separately for 'User'-only and
## 'Developer'-only respondents.
##
## in_list:  table with the columns `do_you_consider_yourself_to_be_an_nf_core`
##           and `tags` (comma-separated tag strings).
## tag_type: name of the tag set being summarised. Kept so existing calls
##           keep working; it does not influence the result, because the
##           summary only retains responder type, topic and count.
## Returns an ungrouped tibble with the columns `responder_type`, `topic`
## (a factor ordered by `count`) and `count`, sorted by descending `count`.
## The 'No feedback' tag is removed from the result.
summarise_tag_results_bytype <- function(in_list, tag_type) {
  in_list |>
    filter(do_you_consider_yourself_to_be_an_nf_core %in% c("User", "Developer")) |>
    mutate(tags = str_split(tags, ",")) |>
    ## Name the list-column explicitly: calling unnest() without `cols` is
    ## deprecated and warns
    unnest(cols = tags) |>
    mutate(tags = str_trim(tags, side = "both")) |>
    rename(
      responder_type = do_you_consider_yourself_to_be_an_nf_core,
      topic = tags
    ) |>
    group_by(responder_type, topic) |>
    ## Drop the grouping so the later steps run on the whole table and no
    ## "grouped output" message is printed
    summarise(count = n(), .groups = "drop") |>
    arrange(desc(count)) |>
    mutate(topic = fct_reorder(topic, count)) |>
    filter(topic != "No feedback")
}

## Per-responder-type topic counts for annotation set 1
data_tags_positive <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_positive,
  tag_type = "positive_tags"
)
data_tags_improve <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_improve,
  tag_type = "improvement_tags"
)
data_tags_request <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_request,
  tag_type = "request_tags"
)

## Per-responder-type topic counts for annotation set 2
data_tags2_positive <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_2_positive,
  tag_type = "positive_tags"
)
data_tags2_improve <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_2_improve,
  tag_type = "improvement_tags"
)
data_tags2_request <- summarise_tag_results_bytype(
  in_list = data_responses_tagged_2_request,
  tag_type = "request_tags"
)

Distribution of feedback topics

We can then plot bar charts with the number of responses hitting a particular topic. Note that we remove responses that had no feedback.

Show the code
## Horizontal bar chart of topic counts for a single responder type.
## Replaces the earlier three-argument definition of the same name.
##
## in_data:   table with `responder_type`, `topic` and `count` columns.
## title:     plot title.
## subtitle:  plot subtitle.
## responder: value of `responder_type` to keep (e.g. 'User' or 'Developer').
## Reads the global `nr_responses` for the caption. The x axis is fixed to
## 0-80 so that plots placed side by side share the same scale.
plot_topics_summary <- function(in_data, title, subtitle, responder) {
  survey_caption <- paste("Based on", nr_responses, "responses to the spring 2025 nf-core community survey")

  ## Re-order the topics after filtering so the bars are sorted by the counts
  ## of the selected responder type only
  responder_topics <- filter(in_data, responder_type == responder)
  responder_topics <- arrange(responder_topics, desc(count))
  responder_topics <- mutate(responder_topics, topic = fct_reorder(topic, count))

  ggplot(responder_topics, aes(x = count, y = topic)) +
    geom_col(fill = "#24b064") +
    theme_minimal() +
    labs(
      title = title,
      subtitle = subtitle,
      x = "Count",
      y = "Topic",
      caption = survey_caption
    ) +
    scale_x_continuous(breaks = seq(0, 70, by = 10), limits = c(0, 80))
}

## Positive topics from developer-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_positive,
  title = "Topics the developer-only  respondees are happy about",
  subtitle = "Reviewer 1 interpretation",
  responder = "Developer"
) +
  plot_topics_summary(
    in_data = data_tags2_positive,
    title = "Topics the developer-only  respondees are happy about",
    subtitle = "Reviewer 2 interpretation",
    responder = "Developer"
  )

Show the code
## Positive topics from user-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_positive,
  title = "Topics the user-only respondees are happy about",
  subtitle = "Reviewer 1 interpretation",
  responder = "User"
) +
  plot_topics_summary(
    in_data = data_tags2_positive,
    title = "Topics the user-only respondees are happy about",
    subtitle = "Reviewer 2 interpretation",
    responder = "User"
  )

Show the code
## Improvement topics from developer-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_improve,
  title = "Topics the developer-only  respondees think should be improved upon",
  subtitle = "Reviewer 1 interpretation",
  responder = "Developer"
) +
  plot_topics_summary(
    in_data = data_tags2_improve,
    title = "Topics the developer-only  respondees think should be improved upon",
    subtitle = "Reviewer 2 interpretation",
    responder = "Developer"
  )

Show the code
## Improvement topics from user-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_improve,
  title = "Topics the user-only  respondees think should be improved upon",
  subtitle = "Reviewer 1 interpretation",
  responder = "User"
) +
  plot_topics_summary(
    in_data = data_tags2_improve,
    title = "Topics the user-only  respondees think should be improved upon",
    subtitle = "Reviewer 2 interpretation",
    responder = "User"
  )

Show the code
## Request topics from developer-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_request,
  title = "Topics of specific requests from developer-only  respondees",
  subtitle = "Reviewer 1 interpretation",
  responder = "Developer"
) +
  plot_topics_summary(
    in_data = data_tags2_request,
    title = "Topics of specific requests from developer-only  respondees",
    subtitle = "Reviewer 2 interpretation",
    responder = "Developer"
  )

Show the code
## Request topics from user-only respondees, both reviewers side by side
plot_topics_summary(
  in_data = data_tags_request,
  title = "Topics of specific requests from user-only respondees",
  subtitle = "Reviewer 1 interpretation",
  responder = "User"
) +
  plot_topics_summary(
    in_data = data_tags2_request,
    title = "Topics of specific requests from user-only respondees",
    subtitle = "Reviewer 2 interpretation",
    responder = "User"
  )